The purpose of this document is to perform EDA on the clean_assessments.csv data to identify patterns in the data that will inform a model of sale price.

library(tidyverse)
library(lubridate)

library(sf)
library(leaflet)

library(skimr)
library(hrbrthemes)
library(janitor)
library(scales)

library(here)

#here::i_am("scripts/eda")

options(scipen = 999, digits = 4)

theme_set(theme_ipsum())

assessment_data_path <- here("data/cleaned/big", "clean_assessment_data_geocoded.csv")

unified_geo_ids_path <- here("data/cleaned/big/unified_geo_ids/unified_geo_ids.shp")

assessments_valid <- read_csv(assessment_data_path) %>% 
  mutate(sale_month = factor(sale_month, levels = month.abb))

geo_ids <- st_read(unified_geo_ids_path)
## Reading layer `unified_geo_ids' from data source `/Users/conortompkins/github_repos/allegheny_house_price_app/data/cleaned/big/unified_geo_ids/unified_geo_ids.shp' using driver `ESRI Shapefile'
## Simple feature collection with 80 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -80.36 ymin: 40.19 xmax: -79.69 ymax: 40.67
## Geodetic CRS:  WGS 84

Get an idea of the data

glimpse(assessments_valid)
## Rows: 160,186
## Columns: 29
## $ par_id               <chr> "0001G00111000000", "0001G00224110600", "0001G002…
## $ usedesc              <chr> "SINGLE FAMILY", "CONDOMINIUM", "CONDOMINIUM", "C…
## $ muni_desc            <chr> "1st Ward  - Pittsburgh", "1st Ward  - Pittsburgh…
## $ sale_desc            <chr> "Valid Sale", "Valid Sale", "Valid Sale", "Other …
## $ sale_price           <dbl> 2000000, 333325, 400000, 387000, 425000, 245000, …
## $ year_built           <dbl> 2015, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2…
## $ style_desc           <chr> "Townhouse", "Condo", "Condo", "Condo", "Condo", …
## $ bedrooms             <dbl> 3, 2, 2, 2, 2, 1, 2, 1, 3, 2, 2, 1, 2, 1, 3, 2, 2…
## $ full_baths           <dbl> 3, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1…
## $ half_baths           <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1…
## $ finished_living_area <dbl> 2011, 1434, 1483, 1388, 1275, 889, 1666, 1107, 17…
## $ lot_area             <dbl> 995, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ grade_desc           <chr> "Excellent", "Very Good", "Very Good", "Very Good…
## $ condition_desc       <chr> "Excellent", "Average", "Average", "Average", "Av…
## $ extfinish_desc       <chr> "Brick", "Concrete", "Concrete", "Concrete", "Con…
## $ roof_desc            <chr> "Roll", "Roll", "Roll", "Roll", "Roll", "Roll", "…
## $ basement_desc        <chr> "Full", "None", "None", "None", "None", "None", "…
## $ cdu_desc             <chr> "Excellent", "Average", "Average", "Average", "Av…
## $ heating_cooling_desc <chr> "Central Heat With AC", "Central Heat With AC", "…
## $ fireplaces           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ basement_garage      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ sale_year            <dbl> 2018, 2007, 2013, 2009, 2019, 2013, 2014, 2008, 2…
## $ sale_month           <fct> Oct, Dec, Sep, Oct, Jun, Sep, Jun, Aug, Feb, May,…
## $ school_desc          <chr> "1st Ward  Pittsburgh", "1st Ward  Pittsburgh", "…
## $ house_age_at_sale    <dbl> 3, 0, 6, 2, 12, 6, 7, 1, 6, 9, 6, 10, 3, 9, 0, 3,…
## $ ac_flag              <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
## $ heat_type            <chr> "Central Heat", "Central Heat", "Central Heat", "…
## $ sale_price_adj       <dbl> 2036244, 410997, 438978, 461176, 425000, 268874, …
## $ geo_id               <chr> "City Council District 6", "City Council District…
skim(assessments_valid)
Data summary
Name assessments_valid
Number of rows 160186
Number of columns 29
_______________________
Column type frequency:
character 15
factor 1
logical 1
numeric 12
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
par_id 0 1 16 16 0 160181 0
usedesc 0 1 8 22 0 13 0
muni_desc 0 1 4 23 0 174 0
sale_desc 0 1 10 11 0 2 0
style_desc 0 1 3 13 0 20 0
grade_desc 0 1 4 13 0 7 0
condition_desc 22 1 4 9 0 8 0
extfinish_desc 1 1 3 14 0 8 0
roof_desc 212 1 4 7 0 6 0
basement_desc 30 1 4 10 0 5 0
cdu_desc 24 1 4 9 0 8 0
heating_cooling_desc 24 1 4 21 0 15 0
school_desc 0 1 6 20 0 78 0
heat_type 24 1 4 13 0 9 0
geo_id 0 1 7 23 0 61 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sale_month 0 1 FALSE 12 Jun: 17138, Jul: 17079, Aug: 16741, May: 15342

Variable type: logical

skim_variable n_missing complete_rate mean count
ac_flag 24 1 0.64 TRU: 102989, FAL: 57173

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sale_price 0 1.00 128739.87 125072.84 325.0 55000 92000 160000 3070000 ▇▁▁▁▁
year_built 0 1.00 1950.69 29.70 1797.0 1930 1953 1970 2019 ▁▁▅▇▃
bedrooms 7 1.00 3.06 0.88 0.0 3 3 4 14 ▂▇▁▁▁
full_baths 21 1.00 1.50 0.66 0.0 1 1 2 12 ▇▁▁▁▁
half_baths 1000 0.99 0.54 0.57 0.0 0 0 1 9 ▇▁▁▁▁
finished_living_area 0 1.00 1700.82 758.72 0.0 1188 1512 2028 12790 ▇▁▁▁▁
lot_area 0 1.00 13181.22 37628.87 0.0 3953 7500 12930 4859466 ▇▁▁▁▁
fireplaces 9064 0.94 0.42 0.56 0.0 0 0 1 7 ▇▁▁▁▁
basement_garage 4020 0.97 0.79 0.84 0.0 0 1 1 6 ▇▂▁▁▁
sale_year 0 1.00 2001.80 11.83 1976.0 1993 2003 2012 2020 ▃▅▆▇▇
house_age_at_sale 0 1.00 51.12 30.51 0.0 28 50 71 219 ▇▇▂▁▁
sale_price_adj 0 1.00 176801.24 146867.78 412.2 90107 140612 215593 3465024 ▇▁▁▁▁

Sales Price

Inflation-adjusted sales price (sale_price_adj) is normally distributed on the log10() scale, as expected.

assessments_valid %>% 
  ggplot(aes(sale_price_adj)) +
  geom_density() +
  scale_x_log10(labels = dollar)

Adjusting for inflation (sale_price_adj is 2019 dollars) removes a lot of the drift over time.

assessments_valid %>%
  select(sale_year, sale_price, sale_price_adj) %>% 
  pivot_longer(cols = contains("sale_price")) %>% 
  ggplot(aes(sale_year, value, color = name)) +
  geom_smooth() +
  scale_y_continuous(labels = dollar)

assessments_valid %>% 
  mutate(sale_month = fct_rev(sale_month)) %>% 
  add_count(sale_month) %>% 
  ggplot(aes(sale_month, sale_price_adj, group = sale_month, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_y_log10() +
  coord_cartesian(ylim = c(10^4, 10^6)) +
  scale_fill_viridis_c()

Location

The location of the house (geo_id) has a big impact on sale_price_adj.

geo_id_median_price <- assessments_valid %>% 
  group_by(geo_id) %>% 
  summarize(median_price = median(sale_price_adj))

pal <- colorNumeric(
  palette = "viridis",
  domain = geo_id_median_price$median_price)

geo_ids %>% 
  left_join(geo_id_median_price) %>% 
  leaflet() %>% 
  addProviderTiles(providers$Stamen.TonerLite,
                   options = providerTileOptions(noWrap = TRUE,
                                                 minZoom = 9, 
                                                 #maxZoom = 8
                   )) %>% 
  addPolygons(popup = ~ str_c(geo_id, " ", "median price: ", dollar(median_price), sep = ""),
              fillColor = ~pal(median_price),
              fillOpacity = .7,
              color = "black",
              weight = 3) %>% 
  addLegend("bottomright", pal = pal, values = ~median_price,
            title = "Median sales price",
            labFormat = labelFormat(prefix = "$"),
            opacity = 1)
assessments_valid %>%
  add_count(geo_id) %>% 
  mutate(geo_id = fct_reorder(geo_id, sale_price_adj, .fun = median)) %>% 
  ggplot(aes(sale_price_adj, geo_id, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

House Characteristics

The age of the house at time of sale has a negative relationship with sale price, but is heteroskedastic. This indicates that some houses retain their value better over time (and/or that houses that don’t retain value aren’t bought and sold).

assessments_valid %>% 
  ggplot(aes(house_age_at_sale, sale_price_adj)) +
  geom_density_2d_filled() +
  scale_y_log10(labels = dollar)

Lot Area

There is not an obvious relationship between lot_area and sale_price_adj.

assessments_valid %>% 
  ggplot(aes(lot_area, sale_price_adj)) +
  geom_density_2d_filled() +
  scale_y_log10(labels = dollar) +
  coord_cartesian(xlim = c(0, 50000))

But, lot_area varies drastically across geo_id, so there might still be a useful feature to be engineered.

assessments_valid %>% 
  mutate(geo_id = fct_reorder(geo_id, lot_area, .fun = median)) %>% 
  ggplot(aes(lot_area, geo_id)) +
  geom_boxplot(outlier.alpha = 0) +
  coord_cartesian(xlim = c(0, 40000))

lot_area also varies across style_desc.

assessments_valid %>% 
  mutate(style_desc = fct_reorder(style_desc, lot_area, .fun = median)) %>% 
  ggplot(aes(lot_area, style_desc)) +
  geom_boxplot(outlier.alpha = 0) +
  coord_cartesian(xlim = c(0, 10^5))

Finished Living Area

finished_living_area will be a useful variable.

assessments_valid %>% 
  ggplot(aes(finished_living_area, sale_price_adj)) +
  geom_density_2d_filled() +
  scale_y_log10(labels = dollar) +
  coord_cartesian(xlim = c(0, 5000))

finished_living_area also varies across style_desc.

assessments_valid %>% 
  mutate(style_desc = fct_reorder(style_desc, finished_living_area, .fun = median)) %>% 
  ggplot(aes(finished_living_area, style_desc)) +
  geom_boxplot(outlier.alpha = 0) +
  coord_cartesian(xlim = c(0, 10^4))

finished_living_area also varies across geo_id, but not as drastically.

assessments_valid %>% 
  mutate(geo_id = fct_reorder(geo_id, finished_living_area, .fun = median)) %>% 
  ggplot(aes(finished_living_area, geo_id)) +
  geom_boxplot(outlier.alpha = 0) +
  coord_cartesian(xlim = c(0, 10000))

Grade

grade_desc has a clear relationship with sale_price_adj.

assessments_valid %>% 
  mutate(grade_desc = fct_reorder(grade_desc, sale_price_adj, median)) %>% 
  add_count(grade_desc) %>% 
  ggplot(aes(sale_price_adj, grade_desc, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(labels = dollar) +
  scale_fill_viridis_c()

CDU

assessments_valid %>% 
  mutate(cdu_desc = fct_reorder(cdu_desc, sale_price_adj, .fun = median)) %>% 
  add_count(cdu_desc) %>% 
  ggplot(aes(sale_price_adj, cdu_desc,  fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Condition

condition_desc also has a relationship, but some of the levels can probably be collapsed.

assessments_valid %>% 
  mutate(condition_desc = fct_reorder(condition_desc, sale_price_adj, median)) %>% 
  add_count(condition_desc) %>% 
  ggplot(aes(sale_price_adj, condition_desc, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(labels = dollar) +
  scale_fill_viridis_c()

There are 4 main types of houses, with a lot of low-n types that can be collapsed.

assessments_valid %>% 
  count(style_desc) %>% 
  mutate(style_desc = fct_reorder(style_desc, n, .fun = max)) %>% 
  ggplot(aes(n, style_desc)) +
  geom_col()

There is some time-series pattern in when different types of houses were created.

assessments_valid %>% 
  add_count(style_desc) %>% 
  filter(n > 5000) %>% 
  mutate(style_desc = fct_reorder(style_desc, n, .fun = "max", .desc = T)) %>% 
  ggplot(aes(year_built, fill = style_desc)) +
  geom_histogram(binwdidth = 30) +
  guides(fill = FALSE) +
  facet_wrap(~style_desc, scales = "free_y")

Bathrooms

Most homes have between 1 and 2 full and half bathrooms.

assessments_valid %>% 
  count(full_baths, half_baths) %>% 
  complete(full_baths = 0:12, half_baths = 0:9, fill = list(n = 0)) %>% 
  ggplot(aes(full_baths, half_baths, fill = n)) +
  geom_tile() +
  scale_fill_viridis_c() +
  scale_x_continuous(breaks = c(0:12),
                     expand = c(0,0)) +
  scale_y_continuous(breaks = c(0:12),
                     expand = c(0,0)) +
  coord_equal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

fullbaths and sale_price_adj are positively related.

assessments_valid %>% 
  add_count(full_baths) %>% 
  ggplot(aes(sale_price_adj, full_baths, fill = n, group = full_baths)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_y_continuous(breaks = c(0:12)) +
  scale_fill_viridis_c()

There appear to be diminishing returns on the number of half bathrooms.

assessments_valid %>% 
  add_count(half_baths) %>% 
  ggplot(aes(sale_price_adj, half_baths, fill = n, group = half_baths)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_y_continuous(breaks = c(0:9)) +
  scale_fill_viridis_c()

Heating and cooling

Need to split these out to heat_type and ac_flag.

assessments_valid %>% 
  mutate(heating_cooling_desc = fct_reorder(heating_cooling_desc, sale_price_adj, .fun = median)) %>% 
  add_count(heating_cooling_desc) %>% 
  ggplot(aes(sale_price_adj, heating_cooling_desc, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Impute missing based on mode for geo_id and style_desc.

assessments_valid %>% 
  mutate(heat_type = fct_reorder(heat_type, sale_price_adj, .fun = median)) %>% 
  add_count(heat_type) %>% 
  ggplot(aes(sale_price_adj, heat_type, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Impute missing based on mode for geo_id and style_desc.

assessments_valid %>% 
  mutate(ac_flag = as.character(ac_flag)) %>% 
  mutate(ac_flag = fct_reorder(ac_flag, sale_price_adj, .fun = median)) %>% 
  add_count(ac_flag) %>% 
  ggplot(aes(sale_price_adj, ac_flag, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Exterior

Impute missing based on mode for geo_id and style_desc.

assessments_valid %>% 
  mutate(extfinish_desc = fct_reorder(extfinish_desc, sale_price_adj, .fun = median)) %>% 
  add_count(extfinish_desc) %>% 
  ggplot(aes(sale_price_adj, extfinish_desc, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Roof

Impute missing based on mode for geo_id and style_desc.

assessments_valid %>% 
  mutate(roof_desc = fct_reorder(roof_desc, sale_price_adj, .fun = median)) %>% 
  add_count(roof_desc) %>% 
  ggplot(aes(sale_price_adj, roof_desc, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Basement

Impute missing based on mode for geo_id and style_desc.

assessments_valid %>% 
  mutate(basement_desc = fct_reorder(basement_desc, sale_price_adj, .fun = median)) %>% 
  add_count(basement_desc) %>% 
  ggplot(aes(sale_price_adj, basement_desc, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Basement garage

Impute missing based on mode for geo_id and style_desc. Need to look up what this column is. Number of cars that can fit in the basement garage?

assessments_valid %>% 
  select(sale_price_adj, basement_garage) %>% 
  mutate(basement_garage = as.character(basement_garage),
         basement_garage = fct_explicit_na(basement_garage),
         basement_garage = fct_reorder(basement_garage, sale_price_adj, .fun = median)) %>% 
  add_count(basement_garage) %>% 
  ggplot(aes(sale_price_adj, basement_garage, group = basement_garage, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Fireplaces

Positive relationship between number of fireplaces and sale price, but most houses have 1 or 2. Consider changing to lgl fireplace_flag column where it checks fireplaces >= 1.

assessments_valid %>% 
  add_count(fireplaces) %>% 
  ggplot(aes(sale_price_adj, fireplaces, group = fireplaces, fill = n)) +
  geom_boxplot(outlier.alpha = 0,
               color = "grey") +
  scale_x_log10(label = dollar) +
  scale_fill_viridis_c()

Identify UI cutoffs

assessments_valid %>% 
  ggplot(aes(lot_area)) +
  geom_density() +
  geom_vline(xintercept = 40000) +
  scale_x_log10()

assessments_valid %>% 
  ggplot(aes(finished_living_area)) +
  geom_density() +
  scale_x_log10() +
  geom_vline(xintercept = 10000)